"""
Integration tests for dbt Cloud auto-discovery functionality.

Tests focus on:
- End-to-end auto-discovery workflow
- Multi-job ingestion and metadata aggregation
- Error handling across multiple jobs
- Comparison between explicit and auto-discovery modes
"""

from typing import Any, Dict
from unittest import mock

import pytest

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.dbt.dbt_cloud import (
    AutoDiscoveryConfig,
    DBTCloudConfig,
    DBTCloudSource,
)
from datahub.ingestion.source.dbt.dbt_cloud_models import (
    DBTCloudDeploymentType,
    DBTCloudEnvironment,
    DBTCloudJob,
)


@pytest.fixture
def mock_graphql_response() -> Dict[str, Any]:
    """Sample GraphQL response for a job's nodes."""
    return {
        "job": {
            "models": [
                {
                    "uniqueId": "model.test_project.test_model",
                    "name": "test_model",
                    "description": "Test model description",
                    "resourceType": "model",
                    "database": "test_db",
                    "schema": "test_schema",
                    "alias": None,
                    "packageName": "test_project",
                    "columns": [],
                    "meta": {},
                    "tags": [],
                    "rawCode": "SELECT * FROM test_table",
                    "compiledCode": "SELECT * FROM test_table",
                    "tests": [],
                    "upstreamSources": [],
                    "materializedType": "table",
                    "status": "success",
                    "error": None,
                    "skip": False,
                    "dependsOn": [],
                }
            ],
            "sources": [],
            "seeds": [],
            "snapshots": [],
            "tests": [],
        }
    }


class TestAutoDiscoveryEndToEnd:
    """End-to-end tests for auto-discovery workflow."""

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_auto_discovery_multiple_jobs_success(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Should successfully ingest metadata from multiple discovered jobs."""
        # Setup: 2 jobs discovered in production environment
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
            DBTCloudJob(id=200, generate_docs=True),
        ]
        mock_graphql.return_value = mock_graphql_response

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        nodes, additional_metadata = source.load_nodes()

        # Assertions
        # Should have called GraphQL for both jobs (models, sources, seeds, snapshots, tests = 5 calls per job)
        assert mock_graphql.call_count == 10  # 2 jobs * 5 node types

        # Verify both job IDs were queried
        job_ids_queried = {
            call[1]["variables"]["jobId"] for call in mock_graphql.call_args_list
        }
        assert job_ids_queried == {100, 200}

        # Verify run_id is None (always use latest in auto-discovery)
        for call in mock_graphql.call_args_list:
            assert call[1]["variables"]["runId"] is None

        # Verify nodes were collected (2 models from 2 jobs)
        assert len(nodes) == 2

        # Verify metadata contains account_id but not job_id (since multiple jobs)
        assert additional_metadata["account_id"] == "123456"
        assert "job_id" not in additional_metadata

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_auto_discovery_partial_job_failure(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Should continue ingesting other jobs when one job fails."""
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
            DBTCloudJob(id=200, generate_docs=True),
            DBTCloudJob(id=300, generate_docs=True),
        ]

        # Job 200 fails, others succeed
        def graphql_side_effect(*args: Any, **kwargs: Any) -> Dict[str, Any]:
            job_id = kwargs["variables"]["jobId"]
            if job_id == 200:
                raise ValueError("GraphQL error for job 200")
            return mock_graphql_response

        mock_graphql.side_effect = graphql_side_effect

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        nodes, _ = source.load_nodes()

        # Should have nodes from jobs 100 and 300 only (2 models)
        assert len(nodes) == 2

        # Verify warning was logged for job 200 failure
        # (This is implicit - the source continues without raising)

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_auto_discovery_no_jobs_found(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
    ) -> None:
        """Should return empty results when no jobs are discovered."""
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=False),  # Filtered out
        ]

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        nodes, additional_metadata = source.load_nodes()

        # Should have no nodes and empty metadata when no jobs are found
        assert len(nodes) == 0
        assert additional_metadata == {}

        # Should not have called GraphQL
        mock_graphql.assert_not_called()


class TestExplicitModeComparison:
    """Tests comparing explicit mode behavior with auto-discovery mode."""

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    def test_explicit_mode_uses_configured_run_id(
        self,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Explicit mode should use the configured run_id."""
        mock_graphql.return_value = mock_graphql_response

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            job_id=12345678,
            run_id=999,  # Explicit run_id
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        source.load_nodes()

        # Verify run_id=999 was used in all GraphQL calls
        for call in mock_graphql.call_args_list:
            assert call[1]["variables"]["runId"] == 999

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_auto_discovery_ignores_run_id(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Auto-discovery mode should ignore run_id and use latest."""
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
        ]
        mock_graphql.return_value = mock_graphql_response

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            run_id=999,  # Should be ignored in auto-discovery
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        source.load_nodes()

        # Verify run_id=None was used (latest run)
        for call in mock_graphql.call_args_list:
            assert call[1]["variables"]["runId"] is None

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_explicit_mode_metadata_includes_job_id(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Explicit mode should include job_id in additional metadata."""
        mock_graphql.return_value = mock_graphql_response

        # Explicit mode
        config_explicit = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            job_id=12345678,
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source_explicit = DBTCloudSource(config_explicit, ctx)

        _, metadata_explicit = source_explicit.load_nodes()

        # Auto-discovery mode
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
        ]

        config_auto = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        source_auto = DBTCloudSource(config_auto, ctx)

        _, metadata_auto = source_auto.load_nodes()

        # Verify metadata differences
        # Note: Based on the diff, job_id was removed from additional_metadata
        # This test verifies the current behavior
        assert "account_id" in metadata_explicit
        assert "account_id" in metadata_auto


class TestMetadataConsistency:
    """Tests to ensure auto-discovery produces identical metadata to explicit mode."""

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_auto_discovery_produces_same_metadata_as_explicit_mode(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """
        Critical test: Auto-discovery should produce identical node metadata
        to explicit mode when processing the same job.
        """
        job_id = 12345678
        mock_graphql.return_value = mock_graphql_response

        # Explicit mode: directly specify job_id
        config_explicit = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            job_id=job_id,
            target_platform="snowflake",
        )
        ctx_explicit = PipelineContext(run_id="test-run-id", pipeline_name="test")
        source_explicit = DBTCloudSource(config_explicit, ctx_explicit)
        nodes_explicit, metadata_explicit = source_explicit.load_nodes()

        # Reset mock call count
        mock_graphql.reset_mock()

        # Auto-discovery mode: discover the same job
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=job_id, generate_docs=True),
        ]

        config_auto = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx_auto = PipelineContext(run_id="test-run-id", pipeline_name="test")
        source_auto = DBTCloudSource(config_auto, ctx_auto)
        nodes_auto, metadata_auto = source_auto.load_nodes()

        # Verify both modes produce the same nodes
        assert len(nodes_explicit) == len(nodes_auto)
        assert len(nodes_explicit) == 1  # One model from our mock response

        # Compare node metadata field by field
        node_explicit = nodes_explicit[0]
        node_auto = nodes_auto[0]

        # These fields should be identical
        assert node_explicit.name == node_auto.name
        assert node_explicit.database == node_auto.database
        assert node_explicit.schema == node_auto.schema
        assert node_explicit.dbt_name == node_auto.dbt_name
        assert node_explicit.dbt_adapter == node_auto.dbt_adapter
        assert node_explicit.node_type == node_auto.node_type
        assert node_explicit.description == node_auto.description
        assert node_explicit.raw_code == node_auto.raw_code
        assert node_explicit.compiled_code == node_auto.compiled_code
        assert node_explicit.upstream_nodes == node_auto.upstream_nodes
        assert node_explicit.materialization == node_auto.materialization

        # Additional metadata should contain account_id in both cases
        assert metadata_explicit["account_id"] == metadata_auto["account_id"]


class TestAutoDiscoveryWithPatterns:
    """Tests for job filtering with allow/deny patterns."""

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_job_pattern_filtering(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Should only ingest jobs matching the job_id_pattern."""
        from datahub.configuration.common import AllowDenyPattern

        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=1001, generate_docs=True),  # Matches ^100.*
            DBTCloudJob(id=1002, generate_docs=True),  # Matches ^100.*
            DBTCloudJob(id=2001, generate_docs=True),  # Doesn't match
        ]
        mock_graphql.return_value = mock_graphql_response

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(
                enabled=True,
                job_id_pattern=AllowDenyPattern(allow=["^100.*"]),
            ),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        source.load_nodes()

        # Should only query jobs 1001 and 1002
        job_ids_queried = {
            call[1]["variables"]["jobId"] for call in mock_graphql.call_args_list
        }
        assert 1001 in job_ids_queried
        assert 1002 in job_ids_queried
        assert 2001 not in job_ids_queried

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_deny_pattern_filtering(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
        mock_graphql_response: Dict[str, Any],
    ) -> None:
        """Should exclude jobs matching deny pattern."""
        from datahub.configuration.common import AllowDenyPattern

        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
            DBTCloudJob(id=200, generate_docs=True),
            DBTCloudJob(id=300, generate_docs=True),
        ]
        mock_graphql.return_value = mock_graphql_response

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(
                enabled=True,
                job_id_pattern=AllowDenyPattern(deny=["200"]),  # Exclude job 200
            ),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        source.load_nodes()

        # Should query jobs 100 and 300, but not 200
        job_ids_queried = {
            call[1]["variables"]["jobId"] for call in mock_graphql.call_args_list
        }
        assert 100 in job_ids_queried
        assert 300 in job_ids_queried
        assert 200 not in job_ids_queried


class TestAutoDiscoveryErrorHandling:
    """Tests for error handling in auto-discovery mode."""

    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_environment_api_failure_propagates(self, mock_get_envs: mock.Mock) -> None:
        """Should fail fast when environment API fails."""
        mock_get_envs.side_effect = ValueError("Environment API error")

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Should raise the error
        with pytest.raises(ValueError, match="Environment API error"):
            source.load_nodes()

    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_jobs_api_failure_propagates(
        self, mock_get_envs: mock.Mock, mock_get_jobs: mock.Mock
    ) -> None:
        """Should fail fast when jobs API fails."""
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.side_effect = ValueError("Jobs API error")

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Should raise the error
        with pytest.raises(ValueError, match="Jobs API error"):
            source.load_nodes()

    @mock.patch.object(DBTCloudSource, "_send_graphql_query")
    @mock.patch.object(DBTCloudSource, "_get_jobs_for_project")
    @mock.patch.object(DBTCloudSource, "_get_environments_for_project")
    def test_all_jobs_fail_returns_empty(
        self,
        mock_get_envs: mock.Mock,
        mock_get_jobs: mock.Mock,
        mock_graphql: mock.Mock,
    ) -> None:
        """Should return empty when all jobs fail to fetch nodes."""
        mock_get_envs.return_value = [
            DBTCloudEnvironment(id=1, deployment_type=DBTCloudDeploymentType.PRODUCTION)
        ]
        mock_get_jobs.return_value = [
            DBTCloudJob(id=100, generate_docs=True),
            DBTCloudJob(id=200, generate_docs=True),
        ]
        mock_graphql.side_effect = ValueError("GraphQL error")

        config = DBTCloudConfig(
            access_url="https://test.getdbt.com",
            token="dummy_token",
            account_id=123456,
            project_id=1234567,
            auto_discovery=AutoDiscoveryConfig(enabled=True),
            target_platform="snowflake",
        )
        ctx = PipelineContext(run_id="test-run-id", pipeline_name="test-pipeline")
        source = DBTCloudSource(config, ctx)

        # Execute
        nodes, _ = source.load_nodes()

        # Should have no nodes
        assert len(nodes) == 0
